In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
In [2]:
data = pd.read_csv("data/immobilier.csv")
In [3]:
data.shape
Out[3]:
In [4]:
data.head()
Out[4]:
We want to predict the "SalePrice" column, so all the other columns are features for the model to learn from.
In [ ]:
In [ ]:
In [172]:
features = [col for col in data.columns if col != "SalePrice"]
In [173]:
features
Out[173]:
In [174]:
train = data[features]
y = data.SalePrice
#y = data['SalePrice']
In [175]:
train.head()
Out[175]:
In [176]:
y.head()
Out[176]:
In [177]:
sns.distplot(y)
Out[177]:
In [131]:
# Model for the regression
from sklearn.linear_model import Ridge
In [49]:
import sklearn
In [50]:
sklearn.__version__
Out[50]:
In [181]:
# Initialize the model
model_ridge = Ridge()
In [182]:
# 1) Train the model
model_ridge.fit(train, y)
# Error ...
The model can only take numbers as input, so the string columns must be converted into numeric values.
In [183]:
data['SaleCondition'].head()
Out[183]:
In [184]:
pd.get_dummies(data['SaleCondition'], prefix="SaleCondition").head()
Out[184]:
In [185]:
def prepare_data(data):
    features = [col for col in data.columns if col != "SalePrice"]  # 80 columns
    train = data[features]
    y = data.SalePrice
    # Transform object (string) features into dummy columns
    train = pd.get_dummies(train)
    return train, y
In [186]:
train, y = prepare_data(data.copy())
In [187]:
train.head()
Out[187]:
In [188]:
# 2) Train the model
model_ridge.fit(train, y)
# Error ...
In [189]:
data.BsmtFinType2.value_counts(dropna=False)
Out[189]:
There are missing values that the model cannot take into account, so they must be replaced.
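As a minimal sketch of the replacement strategy used below (a toy example, not part of the original notebook):
In [ ]:
# Toy example: replace a NaN with the mean of its column
toy = pd.DataFrame({'a': [1.0, np.nan, 3.0]})
toy.fillna(toy.mean())  # the NaN becomes 2.0, the mean of column 'a'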
In [190]:
pd.isnull(data).sum()
Out[190]:
In [191]:
def prepare_data(data):
    features = [col for col in data.columns if col != "SalePrice"]
    train = data[features]
    y = data.SalePrice
    # Transform object (string) features into dummy columns
    train = pd.get_dummies(train)
    # Replace NaN values with the mean of the column
    train = train.fillna(train.mean())
    return train, y
In [192]:
train, y = prepare_data(data.copy())
In [193]:
# 3) Train the model
model_ridge.fit(train, y)
# yeah !!!
Out[193]:
In [5]:
from sklearn.metrics import mean_absolute_error
In [195]:
vrai = np.array([1000, 2000, 1500])
prediction = np.array([900, 2200, 1300]) # classic
#prediction = np.array([990, 2005, 1500]) # Best
#prediction = np.array([9000, 22000, 13000]) # Bad
In [196]:
mean_absolute_error(vrai, prediction)
Out[196]:
In [197]:
1000 - 900
Out[197]:
In [198]:
2000 - 2200
Out[198]:
In [199]:
1500 - 1300
Out[199]:
In [200]:
(100 + 200 + 200) / 3.0
Out[200]:
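What we just computed by hand is the mean absolute error: $\mathrm{MAE} = \frac{1}{n}\sum_{i=1}^{n}|y_i - \hat{y}_i|$, i.e. the average of the absolute differences (100, 200, 200) above.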
In [6]:
from sklearn.model_selection import cross_val_score
In [7]:
def cross_validation(model, train, y, cv=5):
    # scikit-learn returns the negated MAE (greater is better), so flip the sign back
    mae = -cross_val_score(model, train, y, scoring="neg_mean_absolute_error", cv=cv)
    return mae
In [203]:
score = cross_validation(model_ridge, train, y)
print(score)
In [204]:
score.mean(), score.std()
Out[204]:
In [205]:
data.SalePrice.describe()
Out[205]:
Here is our first score!!
In [206]:
preds = pd.DataFrame({"preds":model_ridge.predict(train), "true":y})
preds["residuals"] = np.abs(preds["true"] - preds["preds"])
preds.plot(x="preds", y="residuals", kind="scatter")
Out[206]:
In [207]:
preds[preds.residuals >150000]
Out[207]:
In [ ]:
In [ ]:
In [208]:
data.shape
Out[208]:
In [209]:
def prepare_data_outlier(data):
    features = [col for col in data.columns if col != "SalePrice"]
    # Drop the rows whose ids are extreme outliers
    data = data.drop(data.index[[523, 898, 1298]])
    train = data[features]
    y = data.SalePrice
    # Transform object (string) features into dummy columns
    train = pd.get_dummies(train)
    # Replace NaN values with the mean of the column
    train = train.fillna(train.mean())
    return train, y
In [210]:
train, y = prepare_data_outlier(data.copy())
print(train.shape, y.shape)
In [211]:
score = cross_validation(model_ridge, train, y)
print(score.mean())
In [213]:
train, y = prepare_data(data.copy())
print(train.shape, y.shape)
In [214]:
score = cross_validation(model_ridge, train, y)
print(score.mean())
In [ ]:
In [ ]:
In [8]:
from sklearn.model_selection import train_test_split
X_train, X_validation, y_train, y_validation = train_test_split(train, y, random_state = 3)
In [216]:
print("X_train : " + str(X_train.shape))
print("X_validation : " + str(X_validation.shape))
print("y_train : " + str(y_train.shape))
print("y_validation : " + str(y_validation.shape))
In [217]:
model_ridge.fit(X_train, y_train)
Out[217]:
In [218]:
mes_predictions = model_ridge.predict(X_validation)
In [219]:
# My predictions
mes_predictions[0:5]
Out[219]:
In [220]:
# The true values
y_validation[0:5]
Out[220]:
In [221]:
mean_absolute_error(y_validation, mes_predictions)
Out[221]:
In [222]:
plt.scatter(mes_predictions, y_validation)
plt.plot([min(mes_predictions),max(mes_predictions)], [min(mes_predictions),max(mes_predictions)], c="red")
plt.xlabel('My predictions')
plt.ylabel('True values')
Out[222]:
One value seems completely off: we predict 900,000 when it should be under 200,000 ...
In [223]:
analyse = X_validation.copy()
In [224]:
analyse.head()
Out[224]:
In [225]:
analyse['prix'] = y_validation
In [226]:
analyse.head()
Out[226]:
In [227]:
analyse['prediction'] = mes_predictions
In [228]:
analyse.head()
Out[228]:
In [229]:
analyse[analyse.prediction >= 800000]
Out[229]:
In [230]:
sns.countplot(data.SaleCondition)
Out[230]:
In [231]:
sns.distplot(data.SalePrice)
Out[231]:
In [232]:
data.SalePrice.describe()
Out[232]:
In [233]:
sns.distplot(np.log1p(data.SalePrice))
Out[233]:
In [234]:
np.log1p(data.SalePrice).describe()
Out[234]:
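Note that np.log1p(x) computes log(1 + x) and np.expm1 is its exact inverse, so predictions can later be mapped back to the original price scale. A quick round-trip check:
In [ ]:
np.expm1(np.log1p(200000))  # ~ 200000.0: expm1 undoes log1p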
In [235]:
def prepare_data_log(data):
    features = [col for col in data.columns if col != "SalePrice"]
    train = data[features]
    y = data.SalePrice
    # Log-transform the target
    y = np.log1p(y)
    # Transform object (string) features into dummy columns
    train = pd.get_dummies(train)
    # Replace NaN values with the mean of the column
    train = train.fillna(train.mean())
    return train, y
In [236]:
train, y = prepare_data_log(data)
In [237]:
score = cross_validation(model_ridge, train, y)
print(score.mean())
In [238]:
X_train, X_validation, y_train, y_validation = train_test_split(train, y, random_state = 3)
In [239]:
model_ridge.fit(X_train, y_train)
Out[239]:
In [240]:
mes_predictions = model_ridge.predict(X_validation)
In [241]:
mes_predictions[0:5]
Out[241]:
In [242]:
# The true values
y_validation[0:5]
Out[242]:
In [243]:
mean_absolute_error(y_validation, mes_predictions)
Out[243]:
In [ ]:
In [244]:
mes_predictions_exp = np.expm1(mes_predictions)
y_validation_exp = np.expm1(y_validation)
In [245]:
# Transform the values back to the original scale (expm1 undoes log1p)
mean_absolute_error(y_validation_exp, mes_predictions_exp)
Out[245]:
In [ ]:
In [246]:
plt.scatter(mes_predictions_exp, y_validation_exp)
plt.plot([min(mes_predictions_exp), max(mes_predictions_exp)],
         [min(mes_predictions_exp), max(mes_predictions_exp)], c="red")
plt.xlabel('My predictions')
plt.ylabel('True values')
Out[246]:
In [247]:
def prepare_data_outlier_log(data):
    features = [col for col in data.columns if col != "SalePrice"]
    # Drop the rows whose ids are extreme outliers
    data = data.drop(data.index[[523, 898, 1298]])
    train = data[features]
    y = data.SalePrice
    # Log-transform the target
    y = np.log1p(y)
    # Transform object (string) features into dummy columns
    train = pd.get_dummies(train)
    # Replace NaN values with the mean of the column
    train = train.fillna(train.mean())
    return train, y
In [248]:
train, y = prepare_data_outlier_log(data.copy())
In [249]:
score = cross_validation(model_ridge, train, y)
print(score.mean())
In [250]:
X_train, X_validation, y_train, y_validation = train_test_split(train, y, random_state = 3)
In [251]:
model_ridge.fit(X_train, y_train)
Out[251]:
In [252]:
mes_predictions = model_ridge.predict(X_validation)
In [253]:
mes_predictions[0:5]
Out[253]:
In [254]:
# The true values
y_validation[0:5]
Out[254]:
In [255]:
mes_predictions_exp = np.expm1(mes_predictions)
y_validation_exp = np.expm1(y_validation)
In [256]:
# Transform the values back to the original scale (expm1 undoes log1p)
mean_absolute_error(y_validation_exp, mes_predictions_exp)
Out[256]:
In [257]:
plt.scatter(mes_predictions_exp, y_validation_exp)
plt.plot([min(mes_predictions_exp), max(mes_predictions_exp)],
         [min(mes_predictions_exp), max(mes_predictions_exp)], c="red")
plt.xlabel('My predictions')
plt.ylabel('True values')
Out[257]:
In [130]:
model_ridge = Ridge()
In [259]:
model_ridge
Out[259]:
In [260]:
#alphas = [0.05, 0.1, 0.3, 1, 3, 5, 10, 15, 30, 50, 75]
alphas = [10, 10.5, 11, 11.5, 12, 12.5, 13, 13.5, 14, 14.5, 15, 15.5, 16]
cv_ridge = [cross_validation(Ridge(alpha=alpha, random_state=42), train, y).mean()
            for alpha in alphas]
In [261]:
cv_ridge = pd.Series(cv_ridge, index = alphas)
cv_ridge.plot()
plt.xlabel("alpha")
plt.ylabel("mean absolute error")
Out[261]:
In [262]:
cv_ridge.idxmin()  # alpha with the lowest mean absolute error
Out[262]:
In [263]:
cv_ridge
Out[263]:
In [265]:
score = cross_validation(Ridge(alpha=13.5, random_state=42), train, y)
print(score.mean())
In [266]:
X_train, X_validation, y_train, y_validation = train_test_split(train, y, random_state = 3)
In [267]:
model_ridge = Ridge(alpha=13.5, random_state=42)
model_ridge.fit(X_train, y_train)
Out[267]:
In [268]:
mes_predictions_exp = np.expm1(model_ridge.predict(X_validation))
In [269]:
y_validation_exp = np.expm1(y_validation)
In [270]:
mean_absolute_error(y_validation_exp, mes_predictions_exp)
Out[270]:
In [271]:
plt.scatter(mes_predictions_exp, y_validation_exp)
plt.plot([min(mes_predictions_exp), max(mes_predictions_exp)],
         [min(mes_predictions_exp), max(mes_predictions_exp)], c="red")
plt.xlabel('My predictions')
plt.ylabel('True values')
Out[271]:
In [104]:
data.plot(kind='scatter', x="1stFlrSF", y='SalePrice')
Out[104]:
In [276]:
data.plot(kind='scatter', x="2ndFlrSF", y='SalePrice')
Out[276]:
In [277]:
data['1stFlr_2ndFlr_Sf'] = data['1stFlrSF'] + data['2ndFlrSF']
In [278]:
data.plot(kind='scatter', x="1stFlr_2ndFlr_Sf", y='SalePrice')
Out[278]:
In [280]:
sns.distplot(np.log1p(data['1stFlr_2ndFlr_Sf']))
Out[280]:
In [108]:
data[(data['1stFlr_2ndFlr_Sf'] > 4000) & (data.SalePrice <= 700000)]
Out[108]:
In [320]:
def prepare_data_outlier_log_plus(data):
    # Drop the rows whose ids are extreme outliers
    data = data.drop(data.index[[523, 898, 1298]])
    # Add a new feature
    data['1stFlr_2ndFlr_Sf'] = np.log1p(data['1stFlrSF'] + data['2ndFlrSF'])
    features = [col for col in data.columns if col != "SalePrice"]
    train = data[features]
    y = data.SalePrice
    # Log-transform the target
    y = np.log1p(y)
    # Transform object (string) features into dummy columns
    train = pd.get_dummies(train)
    # Replace NaN values with the mean of the column
    train = train.fillna(train.mean())
    print(train.shape)
    return train, y
In [321]:
train, y = prepare_data_outlier_log_plus(data.copy())
In [322]:
score = cross_validation(Ridge(alpha=13.5, random_state=42), train, y)
print(score.mean())
In [323]:
X_train, X_validation, y_train, y_validation = train_test_split(train, y, random_state = 3)
In [324]:
model_ridge = Ridge(alpha=13.5, random_state=42)
model_ridge.fit(X_train, y_train)
Out[324]:
In [325]:
mes_predictions_exp = np.expm1(model_ridge.predict(X_validation))
In [326]:
y_validation_exp = np.expm1(y_validation)
In [327]:
mean_absolute_error(y_validation_exp, mes_predictions_exp)
Out[327]:
In [328]:
plt.scatter(mes_predictions_exp, y_validation_exp)
plt.plot([min(mes_predictions_exp), max(mes_predictions_exp)],
         [min(mes_predictions_exp), max(mes_predictions_exp)], c="red")
plt.xlabel('My predictions')
plt.ylabel('True values')
Out[328]:
In [329]:
model_ridge.coef_[0:10]
Out[329]:
In [330]:
coef = pd.Series(model_ridge.coef_, index=X_train.columns)
# Take the 25 most important positive and negative features
nb_important = 25
imp_coef = pd.concat([coef.sort_values().head(nb_important),
                      coef.sort_values().tail(nb_important)])
imp_coef.plot(kind="barh", figsize=(10, 8))
plt.title("Coefficients in Model")
Out[330]:
In [61]:
# To display images (no need to type this import)
from IPython.display import Image
In [45]:
data[['YearBuilt', 'GarageYrBlt']].head()
Out[45]:
In [46]:
df = data.copy()  # Work on df without changing the DataFrame data
In [47]:
df['build_home_garage_same_year'] = 0
df.loc[data['YearBuilt'] == data['GarageYrBlt'], 'build_home_garage_same_year'] = 1
In [48]:
df.build_home_garage_same_year.value_counts()
Out[48]:
In [297]:
def prepare_data_outlier_log_plus_2(data):
    # Drop the rows whose ids are extreme outliers
    data = data.drop(data.index[[523, 898, 1298]])
    data['1stFlr_2ndFlr_Sf'] = np.log1p(data['1stFlrSF'] + data['2ndFlrSF'])
    data['build_home_garage_same_year'] = "N"
    data.loc[data['YearBuilt'] == data['GarageYrBlt'], 'build_home_garage_same_year'] = "Y"
    features = [col for col in data.columns if col != "SalePrice"]
    train = data[features]
    y = data.SalePrice
    # Log-transform the target
    y = np.log1p(y)
    # Transform object (string) features into dummy columns
    train = pd.get_dummies(train)
    # Replace NaN values with the mean of the column
    train = train.fillna(train.mean())
    print(train.shape)
    return train, y
In [298]:
train.head()
Out[298]:
In [50]:
data.shape
Out[50]:
In [317]:
train, y = prepare_data_outlier_log_plus_2(data.copy())
In [318]:
score = cross_validation(Ridge(alpha=13.5, random_state=42), train, y)
print(score.mean())
In [35]:
# Best was 0.0794542370234, so this feature is not a positive addition
Always test a feature addition to find out whether its impact is positive or negative, as sketched below.
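A minimal sketch of such a comparison, assuming two prepare functions that differ only by the candidate feature (the helper name evaluate_feature is hypothetical, not part of the original notebook):
In [ ]:
# Hypothetical helper: compare the CV score with and without a candidate feature
def evaluate_feature(prepare_with, prepare_without, data):
    model = Ridge(alpha=13.5, random_state=42)
    scores = {}
    for name, prepare in [("with", prepare_with), ("without", prepare_without)]:
        train, y = prepare(data.copy())
        scores[name] = cross_validation(model, train, y).mean()
    return scores  # keep the feature only if scores["with"] < scores["without"]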
In [309]:
df.MasVnrType.value_counts()
Out[309]:
In [310]:
df.MasVnrType.head()
Out[310]:
In [357]:
#df.MasVnrArea.value_counts()
In [312]:
df.shape
Out[312]:
In [313]:
df[df.MasVnrType == "None"].MasVnrArea.value_counts()
Out[313]:
In [331]:
def prepare_data_outlier_log_plus_3(data):
    # Drop the rows whose ids are extreme outliers
    data = data.drop(data.index[[523, 898, 1298]])
    data['1stFlr_2ndFlr_Sf'] = np.log1p(data['1stFlrSF'] + data['2ndFlrSF'])
    data.loc[data.MasVnrType == 'None', 'MasVnrArea'] = 0
    features = [col for col in data.columns if col != "SalePrice"]
    train = data[features]
    y = data.SalePrice
    # Log-transform the target
    y = np.log1p(y)
    # Transform object (string) features into dummy columns
    train = pd.get_dummies(train)
    # Replace NaN values with the mean of the column
    train = train.fillna(train.mean())
    print(train.shape)
    return train, y
In [ ]:
In [332]:
train, y = prepare_data_outlier_log_plus_3(data.copy())
In [333]:
score = cross_validation(Ridge(alpha=13.5, random_state=42), train, y)
print(score.mean())
In [64]:
Image(url="http://i.giphy.com/GPq3wxmLbwUGA.gif")
Out[64]:
In [341]:
df.BsmtFinType2.value_counts(dropna=False)
Out[341]:
In [343]:
df.BsmtFinSF2.describe()
Out[343]:
In [344]:
df[pd.isnull(df.BsmtFinType2)].BsmtFinSF2.value_counts()
Out[344]:
In [354]:
def prepare_data_outlier_log_plus_4(data):
    # Drop the rows whose ids are extreme outliers
    data = data.drop(data.index[[523, 898, 1298]])
    data['1stFlr_2ndFlr_Sf'] = np.log1p(data['1stFlrSF'] + data['2ndFlrSF'])
    data.loc[data.MasVnrType == 'None', 'MasVnrArea'] = 0
    data.loc[pd.isnull(data.BsmtFinType2), 'BsmtFinSF2'] = 0
    features = [col for col in data.columns if col != "SalePrice"]
    train = data[features]
    y = data.SalePrice
    # Log-transform the target
    y = np.log1p(y)
    # Transform object (string) features into dummy columns
    train = pd.get_dummies(train)
    # Replace NaN values with the mean of the column
    train = train.fillna(train.mean())
    print(train.shape)
    return train, y
In [355]:
train, y = prepare_data_outlier_log_plus_4(data.copy())
In [356]:
score = cross_validation(Ridge(alpha=13.5, random_state=42), train, y)
print(score.mean())
In [230]:
X_train_ridge, X_validation_ridge, y_train_ridge, y_validation_ridge = train_test_split(train, y, random_state = 3)
In [231]:
model_ridge = Ridge(alpha=13.5, random_state=42)
model_ridge.fit(X_train_ridge, y_train_ridge)
Out[231]:
In [137]:
coef = pd.Series(np.abs(model_ridge.coef_), index=X_train.columns)
# Take the 15 features with the smallest absolute coefficients (the least important ones)
nb_important = 15
#imp_coef = pd.concat([coef.sort_values().head(nb_important),
#                      coef.sort_values().tail(nb_important)])
imp_coef = coef.sort_values().head(nb_important)
imp_coef.plot(kind="barh", figsize=(10, 8))
plt.title("Coefficients in Model")
Out[137]:
In [367]:
coef.sort_values().head(10)
Out[367]:
In [368]:
features_to_delete = ["GarageCond_Ex",
                      "Condition2_RRAe",
                      "Exterior1st_Stone",
                      "MiscFeature_TenC",
                      "MiscVal",
                      "BsmtUnfSF",
                      "LotArea",
                      "MasVnrArea",
                      "GarageYrBlt",
                      "Id"]
In [369]:
def prepare_data_outlier_log_plus_4_bis(data):
    # Drop the rows whose ids are extreme outliers
    data = data.drop(data.index[[523, 898, 1298]])
    data['1stFlr_2ndFlr_Sf'] = np.log1p(data['1stFlrSF'] + data['2ndFlrSF'])
    data.loc[data.MasVnrType == 'None', 'MasVnrArea'] = 0
    data.loc[pd.isnull(data.BsmtFinType2), 'BsmtFinSF2'] = 0
    features = [col for col in data.columns if col != "SalePrice"]
    train = data[features]
    y = data.SalePrice
    # Log-transform the target
    y = np.log1p(y)
    # Transform object (string) features into dummy columns
    train = pd.get_dummies(train)
    # Replace NaN values with the mean of the column
    train = train.fillna(train.mean())
    train = train.drop(features_to_delete, axis=1)
    print(train.shape)
    return train, y
In [370]:
train, y = prepare_data_outlier_log_plus_4_bis(data.copy())
In [371]:
score = cross_validation(Ridge(alpha=13.5, random_state=42), train, y)
print(score.mean())
In [375]:
#pd.isnull(df).sum()
In [376]:
df.shape
Out[376]:
In [122]:
column_detail = pd.DataFrame(pd.isnull(df).sum(), columns=['nbr_null'])
column_detail.sort_values('nbr_null', ascending=False, inplace=True)
column_detail.head(10)
Out[122]:
In [227]:
def prepare_data_outlier_log_plus_5(data):
    # Drop the rows whose ids are extreme outliers
    data = data.drop(data.index[[523, 898, 1298]])
    data['1stFlr_2ndFlr_Sf'] = np.log1p(data['1stFlrSF'] + data['2ndFlrSF'])
    data.loc[data.MasVnrType == 'None', 'MasVnrArea'] = 0
    data.loc[pd.isnull(data.BsmtFinType2), 'BsmtFinSF2'] = 0
    # Drop the features with too many null values
    data = data.drop(['PoolQC', 'MiscFeature', 'Alley', 'Fence'], axis=1)
    features = [col for col in data.columns if col != "SalePrice"]
    train = data[features]
    y = data.SalePrice
    # Log-transform the target
    y = np.log1p(y)
    # Transform object (string) features into dummy columns
    train = pd.get_dummies(train)
    # Replace NaN values with the mean of the column
    train = train.fillna(train.mean())
    print(train.shape)
    return train, y
In [228]:
train, y = prepare_data_outlier_log_plus_5(data.copy())
In [134]:
score = cross_validation(Ridge(alpha=13.5, random_state=42), train, y)
print(score.mean())
In [380]:
Image(url="http://i.giphy.com/LZfZXcFNOOzw4.gif")
Out[380]:
In [73]:
from sklearn.tree import DecisionTreeRegressor
In [96]:
np.random.seed(42)
In [74]:
dt = DecisionTreeRegressor(random_state=0)
In [75]:
def dt_prepare_data(data):
    features = [col for col in data.columns if col != "SalePrice"]
    train = data[features]
    y = data.SalePrice
    # Replace NaN values with the mean of the column
    train = train.fillna(train.mean())
    print(train.shape)
    return train, y
In [76]:
train, y = dt_prepare_data(data.copy())
In [77]:
train.head()
Out[77]:
In [78]:
# 1) Train the model
dt.fit(train, y)
#Error...
In [97]:
from sklearn.preprocessing import LabelEncoder
The same problem with our string data...
In [98]:
categoricals = [x for x in data.columns if data[x].dtype == 'object']
In [99]:
categoricals
Out[99]:
In [100]:
data.SaleCondition.head()
Out[100]:
In [101]:
lbl = LabelEncoder()  # initialize the encoder
lbl.fit(data['SaleCondition'].values)
test = lbl.transform(data['SaleCondition'].values)
In [102]:
test[0:5]
Out[102]:
Each string value is converted into a numeric value.
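To see which integer each category received (LabelEncoder sorts the classes alphabetically), we can inspect the fitted encoder:
In [ ]:
# Map each category to its integer code
dict(zip(lbl.classes_, lbl.transform(lbl.classes_)))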
In [103]:
def dt_prepare_data_plus(data):
    features = [col for col in data.columns if col != "SalePrice"]
    train = data[features]
    y = data.SalePrice
    # Encode the string columns as integers
    categoricals = [x for x in train.columns if train[x].dtype == 'object']
    for col in categoricals:
        lbl = LabelEncoder()
        lbl.fit(train[col].values)
        train[col] = lbl.transform(train[col].values)
    # Replace NaN values with the mean of the column
    train = train.fillna(train.mean())
    print(train.shape)
    return train, y
In [104]:
train, y = dt_prepare_data_plus(data.copy())
In [105]:
train.head()
Out[105]:
In [106]:
data.head()
Out[106]:
In [108]:
score = cross_validation(dt, train, y)
print(score.mean())
In [109]:
def dt_prepare_data_plus_log(data):
    features = [col for col in data.columns if col != "SalePrice"]
    train = data[features]
    y = data.SalePrice
    # Log-transform the target
    y = np.log1p(y)
    # Encode the string columns as integers
    categoricals = [x for x in train.columns if train[x].dtype == 'object']
    for col in categoricals:
        lbl = LabelEncoder()
        lbl.fit(train[col].values)
        train[col] = lbl.transform(train[col].values)
    # Replace NaN values with the mean of the column
    train = train.fillna(train.mean())
    print(train.shape)
    return train, y
In [110]:
train, y = dt_prepare_data_plus_log(data.copy())
In [112]:
score = cross_validation(dt, train, y)
print(score.mean())
In [113]:
from sklearn.ensemble import RandomForestRegressor
In [114]:
rfr = RandomForestRegressor(random_state=0)
In [115]:
score = cross_validation(rfr, train, y)
print(score.mean())
In [149]:
X_train, X_validation, y_train, y_validation = train_test_split(train, y, random_state = 3)
In [150]:
rfr.fit(X_train, y_train)
Out[150]:
In [151]:
mes_predictions_exp = np.expm1(rfr.predict(X_validation))
In [152]:
mes_predictions_exp[0:5]
Out[152]:
In [36]:
y_validation_exp = np.expm1(y_validation)  # expm1, not exp: the target was log1p-transformed
In [122]:
y_validation_exp[0:5]
Out[122]:
In [123]:
mean_absolute_error(y_validation_exp, mes_predictions_exp)
Out[123]:
In [124]:
plt.scatter(mes_predictions_exp, y_validation_exp)
plt.plot([min(mes_predictions_exp), max(mes_predictions_exp)],
         [min(mes_predictions_exp), max(mes_predictions_exp)], c="red")
plt.xlabel('My predictions')
plt.ylabel('True values')
Out[124]:
In [147]:
def dt_prepare_data_plus_log_1(data):
    # Drop the rows whose ids are extreme outliers
    data = data.drop(data.index[[523, 898, 1298]])
    data['1stFlr_2ndFlr_Sf'] = np.log1p(data['1stFlrSF'] + data['2ndFlrSF'])
    data.loc[data.MasVnrType == 'None', 'MasVnrArea'] = 0
    data.loc[pd.isnull(data.BsmtFinType2), 'BsmtFinSF2'] = 0
    features = [col for col in data.columns if col != "SalePrice"]
    train = data[features]
    y = data.SalePrice
    # Log-transform the target
    y = np.log1p(y)
    # Encode the string columns as integers
    categoricals = [x for x in train.columns if train[x].dtype == 'object']
    for col in categoricals:
        lbl = LabelEncoder()
        lbl.fit(train[col].values)
        train[col] = lbl.transform(train[col].values)
    # Replace NaN values with the mean of the column
    train = train.fillna(train.mean())
    print(train.shape)
    return train, y
In [148]:
train, y = dt_prepare_data_plus_log_1(data.copy())
In [127]:
score = cross_validation(rfr, train, y)
print(score.mean())
In [160]:
pd.DataFrame?
In [165]:
coef = pd.DataFrame({'col' : X_train.columns,'importance' : rfr.feature_importances_})
coef = coef.sort_values('importance', ascending=False)
top_tree_features = coef.col.head(25)
#plt.figure(figsize=(10, 5))
#coef.head(25).plot(kind='bar')
#plt.title('Feature Significance')
In [167]:
top_tree_features
Out[167]:
In [173]:
coef_ridge = pd.DataFrame({'col': X_train.columns,
                           'importance': model_ridge.coef_})
coef_ridge[coef_ridge.col.isin(list(top_tree_features))].shape
#imp_coef = coef.sort_values().head(nb_important)
#imp_coef.plot(kind = "barh", figsize=(10, 8))
#plt.title("Coefficients in Model")
Out[173]:
In [175]:
coef_ridge.tail()
Out[175]:
In [183]:
rfr = RandomForestRegressor(n_estimators=100, random_state=0, n_jobs=-1)
rfr
Out[183]:
In [184]:
score = cross_validation(rfr, train, y)
print(score.mean())
In [185]:
RandomForestRegressor?
In [204]:
cv_rfr = []
n_estimators = [10, 50, 100, 200]
max_depths = [3, 5, 7]
for n_estimator in n_estimators:
    for max_depth in max_depths:
        print("Running n_estimators: " + str(n_estimator) + " with max_depth: " + str(max_depth))
        score = cross_validation(RandomForestRegressor(n_estimators=n_estimator,
                                                       max_depth=max_depth,
                                                       random_state=0), train, y).mean()
        cv_rfr.append({'n_estimator': n_estimator,
                       'max_depths': max_depth,
                       'score': score})
In [205]:
cv_rfr_df = pd.DataFrame(cv_rfr)
In [206]:
cv_rfr_df
Out[206]:
In [212]:
from sklearn.model_selection import GridSearchCV
param_grid = {"n_estimators": [250, 300],
              "max_depth": [3, 5, 7, 9]}
#grid_search = GridSearchCV(rfr, param_grid, n_jobs=-1, cv=5)
In [213]:
grid_search = GridSearchCV(rfr,
                           param_grid,
                           n_jobs=-1,
                           cv=5,
                           scoring='neg_mean_absolute_error')
In [214]:
grid_search.fit(train, y)
#print(grid_search.best_params_)
Out[214]:
In [215]:
grid_search.grid_scores_  # deprecated; use grid_search.cv_results_ on scikit-learn >= 0.20
Out[215]:
In [216]:
print(grid_search.best_params_)
In [217]:
rfr = RandomForestRegressor(n_estimators=300, max_depth=9, random_state=0)
In [219]:
X_train, X_validation, y_train, y_validation = train_test_split(train, y, random_state = 3)
In [221]:
rfr.fit(X_train, y_train)
Out[221]:
In [222]:
mes_predictions_exp = np.expm1(rfr.predict(X_validation))
In [223]:
mes_predictions_exp[0:5]
Out[223]:
In [224]:
y_validation_exp = np.expm1(y_validation)  # expm1, not exp: the target was log1p-transformed
y_validation_exp[0:5]
Out[224]:
In [225]:
mean_absolute_error(y_validation_exp, mes_predictions_exp)
Out[225]:
In [226]:
plt.scatter(mes_predictions_exp, y_validation_exp)
plt.plot([min(mes_predictions_exp), max(mes_predictions_exp)],
         [min(mes_predictions_exp), max(mes_predictions_exp)], c="red")
plt.xlabel('My predictions')
plt.ylabel('True values')
Out[226]:
In [ ]:
model_ridge = Ridge(alpha=13.5, random_state=42)
In [233]:
mes_predictions_ridge = np.expm1(model_ridge.predict(X_validation_ridge))
In [234]:
mes_predictions_ridge[0:5]
Out[234]:
In [235]:
mes_predictions_exp[0:5]
Out[235]:
In [237]:
resultat = pd.DataFrame({'ridge': mes_predictions_ridge,
                         'tree': mes_predictions_exp,
                         'realite': y_validation_exp})
In [239]:
resultat['moyenne'] = (resultat.ridge + resultat.tree) / 2.0
resultat.head()
Out[239]:
In [240]:
mean_absolute_error(resultat.realite, resultat.ridge)
Out[240]:
In [241]:
mean_absolute_error(resultat.realite, resultat.tree)
Out[241]:
In [242]:
mean_absolute_error(resultat.realite, resultat.moyenne)
Out[242]:
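The 50/50 average is only one possible blend; a minimal sketch of a weighted version (the weight 0.6 is an arbitrary assumption, to be tuned on the validation set):
In [ ]:
# Hypothetical weighted blend: give the ridge predictions more weight
w = 0.6
resultat['blend'] = w * resultat.ridge + (1 - w) * resultat.tree
mean_absolute_error(resultat.realite, resultat.blend)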
In [59]:
coef = pd.Series(rfr.feature_importances_, index = X_train.columns).sort_values(ascending=False)
plt.figure(figsize=(10, 5))
coef.head(25).plot(kind='bar')
plt.title('Feature Significance')
Out[59]:
In [ ]: